#include "statsxx/machine_learning/Ensemble.hpp" // this

// STL
#include <cstddef>             // std::size_t
#include <fstream>             // std::ofstream
#include <functional>          // std::function<>, ::bind(), ::placeholders::_x, ::cref
#include <string>              // std::string
#include <vector>              // std::vector<>

// jScience
#include "jScience/linalg.hpp" // Matrix<>, Vector<>, normalize()
#include "jminimization.h"     // sim_anneal()
#include "jrandnum.hpp"        // rand_num_normal_Mersenne_twister()

// stats++
#include "statsxx/machine_learning/loss_functions.hpp" // quadratic()
#include "statsxx/statistics.hpp"                      // statistics::sum()


// Forward declaration: the scalar objective (ensemble generalization error, Eq. (14))
// minimized by sim_anneal() in ::optimize_Ensemble_gen_err(); defined at the bottom of
// this file.
static double Ensemble_gen_err(
                               const std::vector<double> &w,
                               // -----
                               const Vector<double>      &E,
                               const Matrix<double>      &C,
                               const Vector<double>      &C_ii
                               );


//
// DESC: Minimize the ensemble generalization error.
//
// INPUT:
//
//     double eps :: convergence criterion (of simulated annealing)
//
// -----
//
// NOTE: The ensemble generalization error is defined as:
//
//     E = Emean - Amean
//
//     where Emean is the weighted w_i average of the generalization errors of the individual i learners E_i:
//
//     Emean = sum_i w_i E_i
//
//     and Amean is the weighted average of the ambiguities A_i (ensemble ambiguity):
//
//     Amean = sum_i w_i A_i
//
// -----
//
// NOTE: Following the Krogh paper (see below), (generalization) errors are calculated as quadratic errors.
//
// NOTE: ... This should work for both regression and classification problems.
//
// -----
//
// NOTE: Simulated annealing (with default settings) is used for minimization, which should be robust.
//
// -----
//
// NOTE: See:
//
//     A. Krogh and J. Vedelsby, ``Neural Network Ensembles, Cross Validation, and Active Learning,'' pp. 231--238.
//
template<typename T>
inline void Ensemble<T>::optimize_Ensemble_gen_err(
                                                   const Matrix<double> &X_in,
                                                   const Matrix<double> &X_out,
                                                   // -----
                                                   const double          eps,
                                                   // -----
                                                   const std::string     prefix
                                                   )
{
    //=========================================================
    // INITIALIZATION
    //=========================================================

    // SETUP OUTPUT FILE
    //
    // NOTE: Only the initial/final quadratic errors are written, so the effect of the
    //       weight optimization can be judged.
    std::ofstream ofs( (prefix + ".Ensemble_gen_err.dat") );

    // CALCULATE PROBABILITY OF EACH TRAINING EXAMPLE
    //
    // NOTE: Uniform probabilities are assumed, just as in the calculation of loss_function::quadratic().
    std::vector<double> p(X_in.size(0), (1./X_in.size(0)));

    // CALCULATE OUTPUT FROM INDIVIDUAL LEARNERS
    //
    // NOTE: This is used in several instances below.
    //
    // TODO: NOTE: See TODO: NOTE: in ::optimize_BMA().

    std::vector<Matrix<double>> X_out_i(this->learners.size(), Matrix<double>(X_in.size(0),X_out.size(1)));

    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        for( std::size_t j = 0; j < X_in.size(0); ++j )
        {
            std::vector<double> output = this->learners[i].f_evaluate(X_in.row(j).std_vector());

            for( std::size_t k = 0; k < X_out.size(1); ++k )
            {
                X_out_i[i](j,k) = output[k];
            }
        }
    }

    // CALCULATE INITIAL PREDICTIVE ACCURACY
    Matrix<double> output = this->combine_output(
                                                 X_out_i
                                                 );

    double qerr = loss_function::quadratic(
                                           output,
                                           X_out
                                           );

    ofs << "Initial quadratic error: " << qerr << '\n';

    //=========================================================
    // QUANTITIES NEEDED TO CALCULATE (ENSEMBLE) GENERALIZATION ERROR
    //=========================================================

    // GENERALIZATION ERRORS OF INDIVIDUAL LEARNERS
    //
    // NOTE: Eq. (7)
    Vector<double> E(this->learners.size());

    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        E(i) = loss_function::quadratic(
                                        X_out_i[i],
                                        X_out
                                        );
    }

    // CORRELATION MATRIX (AND ITS DIAGONAL)
    //
    // Eq. (13)
    //
    // NOTE: C is symmetric, so only the lower triangle is computed and then mirrored.
    Matrix<double> C(this->learners.size(),this->learners.size(), 0.);
    // NOTE: The diagonal is also stored (separately), since it is used (separately) below.
    Vector<double> C_ii(this->learners.size());

    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        for( std::size_t j = 0; j <= i; ++j )
        {
            for( std::size_t k = 0; k < X_in.size(0); ++k )
            {
                C(i,j) += ( p[k]*dot_product(X_out_i[i].row(k), X_out_i[j].row(k)) );
            }

            C(j,i) = C(i,j);
        }

        C_ii(i) = C(i,i);
    }

    //=========================================================
    // MINIMIZE (ENSEMBLE) GENERALIZATION ERROR, BY SIMULATED ANNEALING
    //=========================================================

    // NOTE: Defaults are used for all simulated annealing parameters.
    //
    // NOTE: This should lead to a robust (if possibly slow) minimization.
    //
    // NOTE: std::vector<> is used for the annealing work arrays (rather than C-style
    //       variable-length arrays, which are not standard C++); .data() supplies the
    //       raw pointers that the original arrays would have decayed to.

    std::vector<int>    xtype(this->w.size());

    std::vector<double> a(this->w.size());
    std::vector<double> b(this->w.size());

    std::vector<double> v(this->w.size());

    for( std::size_t i = 0; i < this->w.size(); ++i )
    {
        xtype[i] = 1;  // != 0 -> double-valued parameter

        a[i] = 0.;     // weights are searched within [0,1]
        b[i] = 1.;

        // NOTE: The initial step vector is taken to be normally distributed with a mean of 0 and standard deviation of a fraction of the uniform weight size.
        v[i] = rand_num_normal_Mersenne_twister(0., (0.1*(1./this->learners.size())));
    }

    double _T   = -1.;  // < 0 -> use the sim_anneal() default temperature

    int    N_T  = -1;   // < 0 -> default number of steps at a given T
    int    N_S  = -1;   // < 0 -> default number of steps for a given step-size

    // Bind the fixed quantities (E, C, C_ii) so the objective depends only on the weights.
    std::function<double(const std::vector<double> &)> func = std::bind(
                                                                        Ensemble_gen_err,
                                                                        std::placeholders::_1,
                                                                        // -----
                                                                        std::cref(E),
                                                                        std::cref(C),
                                                                        std::cref(C_ii)
                                                                        );

    bool silent = false;

    // TODO: NOTE: sim_anneal() is given access directly to the Ensemble weights (this->w).
    //
    // TODO: NOTE: ... This may be dangerous.
    sim_anneal(
               this->w,        // starting vector in R^n
               this->w.size(), // number of elements of x
               xtype.data(),   // data type of x (0 == int or else == double)
               a.data(),       // min range of x
               b.data(),       // max range of x
               func,           // function to be minimized
               _T,             // temp for simulated annealing (< 0 for default)
               v.data(),       // initial step vector
               eps,            // convergence criterion
               N_T,            // number of steps at a given T (< 0 for default)
               N_S,            // number of steps for a given step-size (< 0 for default)
               silent          // silent mode (do not provide output to a file)
               );

    // (RE-)NORMALIZE WEIGHTS
    //
    // NOTE: Ensemble_gen_err() evaluates normalized weights, but sim_anneal() stores the
    //       raw (un-normalized) ones in this->w; normalize here so the stored weights match
    //       what was actually minimized.
    this->w = normalize(Vector<double>(this->w)).std_vector();

    //=========================================================
    // POST PROCESS
    //=========================================================

    // CALCULATE FINAL PREDICTIVE ACCURACY
    output = this->combine_output(
                                  X_out_i
                                  );

    qerr = loss_function::quadratic(
                                    output,
                                    X_out
                                    );

    // NOTE: The spacing in the following is purposeful (to align with the initial output above).
    ofs << "Final quadratic error  : " << qerr << '\n';

    //=========================================================
    // CLEANUP AND RETURN
    //=========================================================

    ofs.close();
}


//
// DESC: Calculate the ensemble generalization error.
//
// NOTE: This is the function to be minimized by simulated annealing.
//
static double Ensemble_gen_err(
                               const std::vector<double> &w,
                               // -----
                               const Vector<double>      &E,
                               const Matrix<double>      &C,
                               const Vector<double>      &C_ii
                               )
{
    double _E;

    // RENORMALIZE WEIGHTS (IN Vector<> FORM)
    Vector<double> _w = normalize(Vector<double>(w));

    // CALCULATE Ensemble GENERALIZATION ERROR
    //
    // NOTE: Eq. (14)
    _E = dot_product(_w, (E + (C*_w) - C_ii));

    return _E;
}
